Raw data

- GEO number: GSE128615

- Study summary: To determine the influence of differential Kdm6a expression in immune cells, whole transcriptome analysis for CD4+ T cells from WT and Kdm6a cKO mice were performed using RNA-Seq.

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(gridExtra)
library(pheatmap)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)

Setting AnnotationHub

DB <- "EnsDb"  # Assing your species
AnnotationSpecies <- "mus musculus"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))   # Connect to annotation DB

Running AnnotationHub

# Filter annotation of interest
ahQuery <- query(ah, 
                 pattern=c(DB, AnnotationSpecies), 
                 ignore.case=T)      


# Select the most recent data
DBName <- mcols(ahQuery) %>%
    rownames() %>%
    tail(1)

AnnoDb <- ah[[DBName]] 

# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")

# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="TXID",
                 columns=c("GENEID", "GENENAME")) 



# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
##                 TXID             GENEID GENENAME
## 1 ENSMUST00000000001 ENSMUSG00000000001    Gnai3
## 2 ENSMUST00000000003 ENSMUSG00000000003     Pbsn
## 3 ENSMUST00000000010 ENSMUSG00000020875    Hoxb9
## 4 ENSMUST00000000028 ENSMUSG00000000028    Cdc45
## 5 ENSMUST00000000033 ENSMUSG00000048583     Igf2
## 6 ENSMUST00000000049 ENSMUSG00000000049     Apoh

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
#
# Define sample names 
SampleNames <-  c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3)) 

# Define group level
GroupLevel <- c("WT", "cKO")

# Define contrast for DE analysis
Contrast <- c("Group", "cKO", "WT")


# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC

# Define .sf file path
sf <- c(paste0("../salmon_output/", 
               SampleNames,
               ".salmon_quant/quant.sf"))

# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)

rownames(metadata) <- SampleNames

# Explore the metadata
print(metadata)
##            Sample Group                                            Path
## WT-rep1   WT-rep1    WT  ../salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2   WT-rep2    WT  ../salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3   WT-rep3    WT  ../salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1   cKO ../salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2   cKO ../salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3   cKO ../salmon_output/cKO-rep3.salmon_quant/quant.sf

Converting .sf files to txi list

- txi_tpm: stores TPM with the argument “countsFromAbundance=”lengthScaledTPM"

- txi_counts: stores original counts

- In this project, two txi objects were created with or without the countsFromAbundance=“lengthScaledTPM” argument and compared in downstream DE analysis.

- If you don’t want gene-level summarization, set txOut=TRUE.

- References: tximport doc, DESeq2 doc “Why unnormalized counts?”, Soneson et al. 2016, Developer Dr. Love’s comment, Harvard Chan Bioinformatics Core workshop

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames

# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 

txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

# tpm 
head(txi_tpm$counts)
##                       WT-rep1    WT-rep2    WT-rep3     cKO-rep1    cKO-rep2
## ENSMUSG00000000001 6572.14450 5800.14784 5642.53666 5996.0868676 6077.604244
## ENSMUSG00000000003    0.00000    0.00000    0.00000    0.0000000    0.000000
## ENSMUSG00000000028 3917.45158 3550.56247 3714.75413 3422.1658384 3407.098630
## ENSMUSG00000000031   25.53275   61.51501   45.35886    9.5023217   38.694556
## ENSMUSG00000000037   85.32325   75.78921  107.66451  138.5489706  170.156259
## ENSMUSG00000000049    0.00000    0.00000    0.00000    0.9982156    1.995162
##                      cKO-rep3
## ENSMUSG00000000001 5349.93335
## ENSMUSG00000000003    0.00000
## ENSMUSG00000000028 3121.96322
## ENSMUSG00000000031   95.50979
## ENSMUSG00000000037  137.21541
## ENSMUSG00000000049    0.00000
dim(txi_tpm$counts)
## [1] 51956     6
# counts
head(txi_counts$counts)
##                     WT-rep1  WT-rep2  WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492.000 5822.000 5614.000  6040.00  6142.00 5437.000
## ENSMUSG00000000003    0.000    0.000    0.000     0.00     0.00    0.000
## ENSMUSG00000000028 3904.066 3567.989 3714.192  3410.46  3463.04 3142.454
## ENSMUSG00000000031   33.000   45.000   58.000    12.00    28.00   70.000
## ENSMUSG00000000037  119.001   90.001   82.000   165.00   129.00  100.000
## ENSMUSG00000000049    0.000    0.000    0.000     1.00     2.00    0.000
dim(txi_counts$counts)
## [1] 51956     6

Creating DESeq objects from txi and VST

- Note: The tximport-to-DESeq2 approach uses estimated gene counts from the transcript abundance quantifiers, but not normalized counts.

- The DESeqDataSetFromTximport() function generated an DESeq object (aka dds) with the txi input.

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 

    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)

    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)

    # Output them as a list 
    return(list(dds=des, vsd=ves))

}

TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 51956 6 
## metadata(1): version
## assays(1): counts
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##                    WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001    6572    5800    5643     5996     6078     5350
## ENSMUSG00000000003       0       0       0        0        0        0
## ENSMUSG00000000028    3917    3551    3715     3422     3407     3122
## ENSMUSG00000000031      26      62      45       10       39       96
## ENSMUSG00000000037      85      76     108      139      170      137
## ENSMUSG00000000049       0       0       0        1        2        0
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 51956 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##                    WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001    6492    5822    5614     6040     6142     5437
## ENSMUSG00000000003       0       0       0        0        0        0
## ENSMUSG00000000028    3904    3568    3714     3410     3463     3142
## ENSMUSG00000000031      33      45      58       12       28       70
## ENSMUSG00000000037     119      90      82      165      129      100
## ENSMUSG00000000049       0       0       0        1        2        0

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 51956 6 
## metadata(1): version
## assays(1): ''
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 51956 6 
## metadata(1): version
## assays(1): ''
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- Messages when “Counts <- DESeqPrep_fn(Counts)” was run: using ‘avgTxLength’ from assays(dds)

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##   WT-rep1   WT-rep2   WT-rep3  cKO-rep1  cKO-rep2  cKO-rep3 
## 1.1099245 0.9997388 0.9771411 1.0203120 1.0318653 0.8935969
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##            Sample    Group                   Path
##          <factor> <factor>            <character>
## WT-rep1  WT-rep1       WT  ../salmon_output/WT-..
## WT-rep2  WT-rep2       WT  ../salmon_output/WT-..
## WT-rep3  WT-rep3       WT  ../salmon_output/WT-..
## cKO-rep1 cKO-rep1      cKO ../salmon_output/cKO..
## cKO-rep2 cKO-rep2      cKO ../salmon_output/cKO..
## cKO-rep3 cKO-rep3      cKO ../salmon_output/cKO..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##            Sample    Group                   Path sizeFactor
##          <factor> <factor>            <character>  <numeric>
## WT-rep1  WT-rep1       WT  ../salmon_output/WT-..   1.109925
## WT-rep2  WT-rep2       WT  ../salmon_output/WT-..   0.999739
## WT-rep3  WT-rep3       WT  ../salmon_output/WT-..   0.977141
## cKO-rep1 cKO-rep1      cKO ../salmon_output/cKO..   1.020312
## cKO-rep2 cKO-rep2      cKO ../salmon_output/cKO..   1.031865
## cKO-rep3 cKO-rep3      cKO ../salmon_output/cKO..   0.893597

Plotting the size factors in TPM

- The size factors are only available from TPM dds

- Blue dashed line: normalization factor = 1

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))

colnames(sizeFactor) <- 'Size_Factor'

sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 

sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)



# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")

Plotting nornalization factors in the Counts

- DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples. The counts for a gene in each sample is then divided by this mean. The median of these ratios in a sample is the size factor for that sample.

- Blue dashed line: normalization factor = 1

- Colored dots: normlization factors per gene (y-axis) in each sample

- Box plots: distribution of the normalization facters in each group (see the second plot)

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA, alpha=0.5) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.8, 1.2)

Setting functions for QC plots

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1] 

# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {

    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}

# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]

# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {

    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd

    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis (PCA)

- Checkpoints in PCA: source of variation, sample outlier

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

- Checkpoints of correlation heatmap: distance between samples, correlation in a group

- Upper: TPM input

- Lower: Counts input

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Running DE analysis with or without shrinkage

- Shrinkage reduces false positives

- Magnitude of shrinkage is affected by dispersion and sample size

- When the argument type is set to “apeglm”, the coef argument is used instead of constrast. In this dataset, you can set coef=Coef where Coef <- resultsNames(ddsList[[1]]).

- When the type is set to “normal”, the argument contrast is set as shown below.

- References: DESeq2 doc “Alternative shrinkage estimators”, Harvard Chan Bioinformatics Core workshop

# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]]) 
names(desList) <- TvC

# Create a list for TPM and Counts dds 
ddsList <- desList  # DE without shrinkage  
normal.ddsList <- desList    # DE with "normal" shrinkage
ape.ddsList <- desList       # DE with "apeglm" shrinkage
ash.ddsList <- desList       # DE with "ashr" shrinkage

for (x in TvC) {
    
    # Run DESeq() 
    ddsList[[x]] <- DESeq(desList[[x]])
    print(resultsNames(ddsList[[x]]))

    normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                                contrast=Contrast,
                                type="normal")

    ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="apeglm")

    ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="ashr")

}
## [1] "Intercept"       "Group_cKO_vs_WT"
## [1] "Intercept"       "Group_cKO_vs_WT"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Plot dispersion  
for (x in TvC) {

    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Extracting DE results with or without shrinkage

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold 
alpha=0.1 

# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 

# Initialize lists of result tables 
all.resList <- all.ddsList 

# Set a function cleaning table
Sig.fn <- function(df, Input) {
    
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}




for (i in shrinkage) {

    if (i == "None") {

        for (x in TvC) {

        # Extract data frames from unshrunken lfc & clean data 
        all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]], 
                                                       contrast=Contrast, 
                                                       alpha=alpha)) %>% 
        Sig.fn(x)

         } 
    } else {

        # Extract data frames from shrunken lfc & clean data
        for (x in TvC) {

            all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
                Sig.fn(x)
    

        }
    }
}





# Explore the results 
summary(all.resList)
##        Length Class  Mode
## None   2      -none- list
## Normal 2      -none- list
## Apeglm 2      -none- list
## Ashr   2      -none- list
head(all.resList[[1]][['TPM']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5875.2710822      0.0208662 0.05803844  0.3595238
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3505.3855243     -0.1007121 0.07316717 -1.3764651
## 4 ENSMUSG00000000031   47.7535803      0.2351843 0.73316295  0.3207804
## 5 ENSMUSG00000000037  119.5706931      0.7900807 0.24416906  3.2357939
## 6 ENSMUSG00000000049    0.4863883      2.4485413 3.94664218  0.6204113
##        pvalue       padj   FDR Input
## 1 0.719203270 0.94846055 > 0.1   TPM
## 2          NA         NA > 0.1   TPM
## 3 0.168677681 0.60188898 > 0.1   TPM
## 4 0.748376798 0.95470956 > 0.1   TPM
## 5 0.001213049 0.03148688 < 0.1   TPM
## 6 0.534987059         NA > 0.1   TPM
head(all.resList[[1]][['Counts']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5914.7570322     0.02115354 0.05853484  0.3613837
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3528.9626791    -0.10046859 0.07347464 -1.3673913
## 4 ENSMUSG00000000031   45.9582159     0.22318728 0.74187581  0.3008418
## 5 ENSMUSG00000000037  116.2530810     0.80391955 0.24247966  3.3154103
## 6 ENSMUSG00000000049    0.4886776     2.45554210 3.85387491  0.6371619
##         pvalue       padj   FDR  Input
## 1 0.7178126394 0.94593163 > 0.1 Counts
## 2           NA         NA > 0.1 Counts
## 3 0.1715026994 0.60733329 > 0.1 Counts
## 4 0.7635351064 0.95710414 > 0.1 Counts
## 5 0.0009150871 0.02512149 < 0.1 Counts
## 6 0.5240194188         NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5875.2710822     0.02033329 0.05655623  0.3595238
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3505.3855243    -0.09668461 0.07024128 -1.3764651
## 4 ENSMUSG00000000031   47.7535803     0.04527843 0.14148646  0.3207804
## 5 ENSMUSG00000000037  119.5706931     0.53955719 0.16664524  3.2357939
## 6 ENSMUSG00000000049    0.4863883     0.02254895 0.03170670  0.6204113
##        pvalue       padj   FDR Input
## 1 0.719203270 0.94846055 > 0.1   TPM
## 2          NA         NA > 0.1   TPM
## 3 0.168677681 0.60188898 > 0.1   TPM
## 4 0.748376798 0.95470956 > 0.1   TPM
## 5 0.001213049 0.03148688 < 0.1   TPM
## 6 0.534987059         NA > 0.1   TPM
head(all.resList[[2]][['Counts']])
##                 Gene     baseMean log2FoldChange      lfcSE       stat
## 1 ENSMUSG00000000001 5914.7570322     0.02060873 0.05702735  0.3613837
## 2 ENSMUSG00000000003    0.0000000             NA         NA         NA
## 3 ENSMUSG00000000028 3528.9626791    -0.09645101 0.07053657 -1.3673913
## 4 ENSMUSG00000000031   45.9582159     0.04223396 0.14140232  0.3008418
## 5 ENSMUSG00000000037  116.2530810     0.55189675 0.16671595  3.3154103
## 6 ENSMUSG00000000049    0.4886776     0.02385755 0.03270373  0.6371619
##         pvalue       padj   FDR  Input
## 1 0.7178126394 0.94593163 > 0.1 Counts
## 2           NA         NA > 0.1 Counts
## 3 0.1715026994 0.60733329 > 0.1 Counts
## 4 0.7635351064 0.95710414 > 0.1 Counts
## 5 0.0009150871 0.02512149 < 0.1 Counts
## 6 0.5240194188         NA > 0.1 Counts

Exploring mean-difference relationship with MA plots

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

- Reference: DESeq2 doc “MA-plot”

# Set ylim: has to adjusted by users depending on data 
yl <- c(-7.5, 7.5)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Initialize a list storing MA plots
MAList <- ddsList


# Create MA plots

for (i in shrinkage) {

    both.df <- rbind(all.resList[[i]][[TvC[1]]], 
                     all.resList[[i]][[TvC[2]]])

    MAList[[i]] <- ggplot(both.df, 
                              aes(x=baseMean, y=log2FoldChange, color=FDR))  +geom_point() + scale_x_log10() + facet_grid(~Input) + 
                                   theme_bw() + 
                                   scale_color_manual(values=c("blue", "grey")) +
                                   ggtitle(paste("MA plot with", i)) + 
                                   ylim(yl[1], yl[2]) + 
                                   geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red") 

}

   

# Print MA plots
MAList
## $TPM
## class: DESeqDataSet 
## dim: 51956 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
## 
## $Counts
## class: DESeqDataSet 
## dim: 51956 6 
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(51956): ENSMUSG00000000001 ENSMUSG00000000003 ...
##   ENSMUSG00001118662 ENSMUSG00001118663
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
## 
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

Exploring distribution of false discovery rate (FDR)

- Distribution of adjusted p-val (FDR) was presented

- x-axis: FDR

- y-axis: Number of genes

- Black dashed line: FDR = alpha

# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']], 
                 all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)

# Plot distribution of FDR
ggplot(both.df,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") + 
xlab("Adjusted P-Value (FDR)") + 
ylab("Count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Exploring distribution of log2FoldChange by input type

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

# Initialize a lfc list
lfcplotList <- all.resList 

# Create lfc distribution plots
for (i in shrinkage) {

    lfc.df <- rbind(all.resList[[i]][[TvC[1]]], 
                    all.resList[[i]][[TvC[2]]])

    lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]

    lfc.df$Input <- factor(lfc.df$Input, levels=TvC)

    lfcplotList[[i]] <- ggplot(lfc.df,  # Subset rows with FDR < alpha
                               aes(x=log2FoldChange, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() + ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black", 
           linetype="dashed", 
           size=1) + 
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) + 
xlim(-5, 5)


}

# Print the lfc plots
lfcplotList
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 


# Create a data frame storing NA gene number
NAstat <- both.df %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))

# Plot number of NA genes 
ggplot(NAstat, 
       aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

baseMean/LFC/FDR comparison between TPM and Count inputs

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(GENEID) %>% 
    summarize(num.trans=n_distinct(TXID))

# Create an empty list storing significant gene lfc tables 
sigList <- list() 


# Filter significant genes' lfc and save in the list 
for (x in shrinkage) {

    for (y in TvC) {

        # Subset genes with FDR below alpha
        sigList[[x]][[y]] <- subset(all.resList[[x]][[y]], 
                                    FDR == paste("<", alpha))

        # Explore the output 
        print(head(sigList[[x]][[y]]))
        print(dim(sigList[[x]][[y]]))

    }
}
##                  Gene    baseMean log2FoldChange      lfcSE      stat
## 5  ENSMUSG00000000037   119.57069      0.7900807 0.24416906  3.235794
## 16 ENSMUSG00000000125    32.58857      1.6829572 0.58565055  2.873654
## 49 ENSMUSG00000000278  2045.98646      0.3824490 0.09661649  3.958423
## 55 ENSMUSG00000000303    17.85736     -1.9802205 0.61593594 -3.214978
## 57 ENSMUSG00000000308   176.27983      0.8913243 0.23443336  3.802037
## 78 ENSMUSG00000000409 16477.95193     -0.2448023 0.06132521 -3.991871
##          pvalue        padj   FDR Input
## 5  1.213049e-03 0.031486877 < 0.1   TPM
## 16 4.057528e-03 0.072140679 < 0.1   TPM
## 49 7.544613e-05 0.003849140 < 0.1   TPM
## 55 1.304545e-03 0.033524391 < 0.1   TPM
## 57 1.435114e-04 0.006551012 < 0.1   TPM
## 78 6.555411e-05 0.003512334 < 0.1   TPM
## [1] 927   9
##                  Gene    baseMean log2FoldChange      lfcSE      stat
## 5  ENSMUSG00000000037   116.25308      0.8039195 0.24247966  3.315410
## 16 ENSMUSG00000000125    32.76783      1.6985679 0.57789160  2.939250
## 49 ENSMUSG00000000278  2059.81139      0.3823293 0.09661858  3.957099
## 55 ENSMUSG00000000303    17.90961     -1.9770953 0.60846497 -3.249317
## 57 ENSMUSG00000000308   177.45680      0.8927097 0.23256220  3.838585
## 78 ENSMUSG00000000409 16587.89462     -0.2445521 0.06193397 -3.948593
##          pvalue        padj   FDR  Input
## 5  9.150871e-04 0.025121492 < 0.1 Counts
## 16 3.290075e-03 0.062778062 < 0.1 Counts
## 49 7.586541e-05 0.003860243 < 0.1 Counts
## 55 1.156826e-03 0.030092651 < 0.1 Counts
## 57 1.237455e-04 0.005807258 < 0.1 Counts
## 78 7.861174e-05 0.003985386 < 0.1 Counts
## [1] 943   9
##                  Gene    baseMean log2FoldChange      lfcSE      stat
## 5  ENSMUSG00000000037   119.57069      0.5395572 0.16664524  3.235794
## 16 ENSMUSG00000000125    32.58857      0.4491230 0.16093685  2.873654
## 49 ENSMUSG00000000278  2045.98646      0.3565501 0.09007159  3.958423
## 55 ENSMUSG00000000303    17.85736     -0.4992870 0.15972135 -3.214978
## 57 ENSMUSG00000000308   176.27983      0.6242894 0.16406998  3.802037
## 78 ENSMUSG00000000409 16477.95193     -0.2378428 0.05958165 -3.991871
##          pvalue        padj   FDR Input
## 5  1.213049e-03 0.031486877 < 0.1   TPM
## 16 4.057528e-03 0.072140679 < 0.1   TPM
## 49 7.544613e-05 0.003849140 < 0.1   TPM
## 55 1.304545e-03 0.033524391 < 0.1   TPM
## 57 1.435114e-04 0.006551012 < 0.1   TPM
## 78 6.555411e-05 0.003512334 < 0.1   TPM
## [1] 927   9
##                  Gene    baseMean log2FoldChange      lfcSE      stat
## 5  ENSMUSG00000000037   116.25308      0.5518968 0.16671595  3.315410
## 16 ENSMUSG00000000125    32.76783      0.4653339 0.16288391  2.939250
## 49 ENSMUSG00000000278  2059.81139      0.3566400 0.09012448  3.957099
## 55 ENSMUSG00000000303    17.90961     -0.5117566 0.16164889 -3.249317
## 57 ENSMUSG00000000308   177.45680      0.6298021 0.16394142  3.838585
## 78 ENSMUSG00000000409 16587.89462     -0.2375228 0.06015364 -3.948593
##          pvalue        padj   FDR  Input
## 5  9.150871e-04 0.025121492 < 0.1 Counts
## 16 3.290075e-03 0.062778062 < 0.1 Counts
## 49 7.586541e-05 0.003860243 < 0.1 Counts
## 55 1.156826e-03 0.030092651 < 0.1 Counts
## 57 1.237455e-04 0.005807258 < 0.1 Counts
## 78 7.861174e-05 0.003985386 < 0.1 Counts
## [1] 943   9
##                  Gene    baseMean log2FoldChange      lfcSE       pvalue
## 5  ENSMUSG00000000037   119.57069     0.60077253 0.28973994 1.213049e-03
## 16 ENSMUSG00000000125    32.58857     1.01605413 1.00001036 4.057528e-03
## 49 ENSMUSG00000000278  2045.98646     0.33381180 0.10234699 7.544613e-05
## 55 ENSMUSG00000000303    17.85736    -0.04631661 0.10830675 1.304545e-03
## 57 ENSMUSG00000000308   176.27983     0.74965061 0.25875052 1.435114e-04
## 78 ENSMUSG00000000409 16477.95193    -0.23092435 0.06322695 6.555411e-05
##           padj   FDR Input
## 5  0.031486877 < 0.1   TPM
## 16 0.072140679 < 0.1   TPM
## 49 0.003849140 < 0.1   TPM
## 55 0.033524391 < 0.1   TPM
## 57 0.006551012 < 0.1   TPM
## 78 0.003512334 < 0.1   TPM
## [1] 927   8
##                  Gene    baseMean log2FoldChange      lfcSE       pvalue
## 5  ENSMUSG00000000037   116.25308     0.62198670 0.28462912 9.150871e-04
## 16 ENSMUSG00000000125    32.76783     1.09745828 0.86109790 3.290075e-03
## 49 ENSMUSG00000000278  2059.81139     0.33379511 0.10230443 7.586541e-05
## 55 ENSMUSG00000000303    17.90961    -0.04878957 0.11100468 1.156826e-03
## 57 ENSMUSG00000000308   177.45680     0.75437557 0.25589513 1.237455e-04
## 78 ENSMUSG00000000409 16587.89462    -0.21981293 0.06380161 7.861174e-05
##           padj   FDR  Input
## 5  0.025121492 < 0.1 Counts
## 16 0.062778062 < 0.1 Counts
## 49 0.003860243 < 0.1 Counts
## 55 0.030092651 < 0.1 Counts
## 57 0.005807258 < 0.1 Counts
## 78 0.003985386 < 0.1 Counts
## [1] 943   8
##                  Gene    baseMean log2FoldChange      lfcSE       pvalue
## 5  ENSMUSG00000000037   119.57069      0.4397207 0.32998090 1.213049e-03
## 16 ENSMUSG00000000125    32.58857      0.3244977 0.51639717 4.057528e-03
## 49 ENSMUSG00000000278  2045.98646      0.3099511 0.10336092 7.544613e-05
## 55 ENSMUSG00000000303    17.85736     -0.5020908 0.64094898 1.304545e-03
## 57 ENSMUSG00000000308   176.27983      0.6701575 0.30770701 1.435114e-04
## 78 ENSMUSG00000000409 16477.95193     -0.2151323 0.06143825 6.555411e-05
##           padj   FDR Input
## 5  0.031486877 < 0.1   TPM
## 16 0.072140679 < 0.1   TPM
## 49 0.003849140 < 0.1   TPM
## 55 0.033524391 < 0.1   TPM
## 57 0.006551012 < 0.1   TPM
## 78 0.003512334 < 0.1   TPM
## [1] 927   8
##                  Gene    baseMean log2FoldChange      lfcSE       pvalue
## 5  ENSMUSG00000000037   116.25308      0.4743945 0.33019341 9.150871e-04
## 16 ENSMUSG00000000125    32.76783      0.3608942 0.53978349 3.290075e-03
## 49 ENSMUSG00000000278  2059.81139      0.3107556 0.10319337 7.586541e-05
## 55 ENSMUSG00000000303    17.90961     -0.5333992 0.65247640 1.156826e-03
## 57 ENSMUSG00000000308   177.45680      0.6797485 0.30193197 1.237455e-04
## 78 ENSMUSG00000000409 16587.89462     -0.2146729 0.06232476 7.861174e-05
##           padj   FDR  Input
## 5  0.025121492 < 0.1 Counts
## 16 0.062778062 < 0.1 Counts
## 49 0.003860243 < 0.1 Counts
## 55 0.030092651 < 0.1 Counts
## 57 0.005807258 < 0.1 Counts
## 78 0.003985386 < 0.1 Counts
## [1] 943   8
# Clean the data frames with renaming columns
for (x in shrinkage) {

    # Join TPM and Counts tables by GENEID
    df <- inner_join(sigList[[x]][[1]], 
                     sigList[[x]][[2]],
                     by="Gene") 

    # Create a vector storing original column nams 
    my.colname1 <- colnames(sigList[[x]][[1]])[-1]

    # Create a vector storing modified column names
    my.colname2 <- c("GENEID", 
                     paste0(my.colname1, "_", TvC[1]), 
                     paste0(my.colname1, "_", TvC[2]))

    # Rename the columns
    colnames(df) <- my.colname2

    # Add a column storing shrinkage method and drop redundant columns
    df <- df %>% 
        mutate(Shrinkage = x) %>% 
        dplyr::select(-starts_with(c("lfcSE", "stat", "FDR")))

    # Save the cleaned data frame in the list
    sigList[[x]] <- df

    # Explore the output data frame
    print(head(df))
    print(dim(df))
}
##               GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM    padj_TPM
## 1 ENSMUSG00000000037    119.57069          0.7900807 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125     32.58857          1.6829572 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278   2045.98646          0.3824490 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303     17.85736         -1.9802205 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308    176.27983          0.8913243 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409  16477.95193         -0.2448023 6.555411e-05 0.003512334
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1       TPM       116.25308             0.8039195  9.150871e-04 0.025121492
## 2       TPM        32.76783             1.6985679  3.290075e-03 0.062778062
## 3       TPM      2059.81139             0.3823293  7.586541e-05 0.003860243
## 4       TPM        17.90961            -1.9770953  1.156826e-03 0.030092651
## 5       TPM       177.45680             0.8927097  1.237455e-04 0.005807258
## 6       TPM     16587.89462            -0.2445521  7.861174e-05 0.003985386
##   Input_Counts Shrinkage
## 1       Counts      None
## 2       Counts      None
## 3       Counts      None
## 4       Counts      None
## 5       Counts      None
## 6       Counts      None
## [1] 915  12
##               GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM    padj_TPM
## 1 ENSMUSG00000000037    119.57069          0.5395572 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125     32.58857          0.4491230 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278   2045.98646          0.3565501 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303     17.85736         -0.4992870 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308    176.27983          0.6242894 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409  16477.95193         -0.2378428 6.555411e-05 0.003512334
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1       TPM       116.25308             0.5518968  9.150871e-04 0.025121492
## 2       TPM        32.76783             0.4653339  3.290075e-03 0.062778062
## 3       TPM      2059.81139             0.3566400  7.586541e-05 0.003860243
## 4       TPM        17.90961            -0.5117566  1.156826e-03 0.030092651
## 5       TPM       177.45680             0.6298021  1.237455e-04 0.005807258
## 6       TPM     16587.89462            -0.2375228  7.861174e-05 0.003985386
##   Input_Counts Shrinkage
## 1       Counts    Normal
## 2       Counts    Normal
## 3       Counts    Normal
## 4       Counts    Normal
## 5       Counts    Normal
## 6       Counts    Normal
## [1] 915  12
##               GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM    padj_TPM
## 1 ENSMUSG00000000037    119.57069         0.60077253 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125     32.58857         1.01605413 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278   2045.98646         0.33381180 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303     17.85736        -0.04631661 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308    176.27983         0.74965061 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409  16477.95193        -0.23092435 6.555411e-05 0.003512334
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1       TPM       116.25308            0.62198670  9.150871e-04 0.025121492
## 2       TPM        32.76783            1.09745828  3.290075e-03 0.062778062
## 3       TPM      2059.81139            0.33379511  7.586541e-05 0.003860243
## 4       TPM        17.90961           -0.04878957  1.156826e-03 0.030092651
## 5       TPM       177.45680            0.75437557  1.237455e-04 0.005807258
## 6       TPM     16587.89462           -0.21981293  7.861174e-05 0.003985386
##   Input_Counts Shrinkage
## 1       Counts    Apeglm
## 2       Counts    Apeglm
## 3       Counts    Apeglm
## 4       Counts    Apeglm
## 5       Counts    Apeglm
## 6       Counts    Apeglm
## [1] 915  12
##               GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM    padj_TPM
## 1 ENSMUSG00000000037    119.57069          0.4397207 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125     32.58857          0.3244977 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278   2045.98646          0.3099511 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303     17.85736         -0.5020908 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308    176.27983          0.6701575 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409  16477.95193         -0.2151323 6.555411e-05 0.003512334
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1       TPM       116.25308             0.4743945  9.150871e-04 0.025121492
## 2       TPM        32.76783             0.3608942  3.290075e-03 0.062778062
## 3       TPM      2059.81139             0.3107556  7.586541e-05 0.003860243
## 4       TPM        17.90961            -0.5333992  1.156826e-03 0.030092651
## 5       TPM       177.45680             0.6797485  1.237455e-04 0.005807258
## 6       TPM     16587.89462            -0.2146729  7.861174e-05 0.003985386
##   Input_Counts Shrinkage
## 1       Counts      Ashr
## 2       Counts      Ashr
## 3       Counts      Ashr
## 4       Counts      Ashr
## 5       Counts      Ashr
## 6       Counts      Ashr
## [1] 915  12
# Convert a list of data frames to one single data frame 
lfcTable <- sigList[[1]] 

for (i in 2:length(shrinkage)) {

    lfcTable <- rbind(lfcTable, sigList[[i]])

}

# Explore the output data frame
head(lfcTable)
##               GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM    padj_TPM
## 1 ENSMUSG00000000037    119.57069          0.7900807 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125     32.58857          1.6829572 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278   2045.98646          0.3824490 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303     17.85736         -1.9802205 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308    176.27983          0.8913243 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409  16477.95193         -0.2448023 6.555411e-05 0.003512334
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1       TPM       116.25308             0.8039195  9.150871e-04 0.025121492
## 2       TPM        32.76783             1.6985679  3.290075e-03 0.062778062
## 3       TPM      2059.81139             0.3823293  7.586541e-05 0.003860243
## 4       TPM        17.90961            -1.9770953  1.156826e-03 0.030092651
## 5       TPM       177.45680             0.8927097  1.237455e-04 0.005807258
## 6       TPM     16587.89462            -0.2445521  7.861174e-05 0.003985386
##   Input_Counts Shrinkage
## 1       Counts      None
## 2       Counts      None
## 3       Counts      None
## 4       Counts      None
## 5       Counts      None
## 6       Counts      None
dim(lfcTable)
## [1] 3660   12
# Calculate differences between TPM and Counts input data 
# in baseMean, log2FoldChange, and padj
lfcTable <- lfcTable %>%

    mutate(mean_TC=baseMean_TPM - baseMean_Counts,
           lfc_TC=log2FoldChange_TPM - log2FoldChange_Counts,
           FDR_TC=padj_TPM - padj_Counts) %>%

# Add a column storing the number of alternative transcripts
left_join(AnnoDb.ntrans, by="GENEID")

# Explore the output data frame
head(lfcTable)
##               GENEID baseMean_TPM log2FoldChange_TPM   pvalue_TPM    padj_TPM
## 1 ENSMUSG00000000037    119.57069          0.7900807 1.213049e-03 0.031486877
## 2 ENSMUSG00000000125     32.58857          1.6829572 4.057528e-03 0.072140679
## 3 ENSMUSG00000000278   2045.98646          0.3824490 7.544613e-05 0.003849140
## 4 ENSMUSG00000000303     17.85736         -1.9802205 1.304545e-03 0.033524391
## 5 ENSMUSG00000000308    176.27983          0.8913243 1.435114e-04 0.006551012
## 6 ENSMUSG00000000409  16477.95193         -0.2448023 6.555411e-05 0.003512334
##   Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1       TPM       116.25308             0.8039195  9.150871e-04 0.025121492
## 2       TPM        32.76783             1.6985679  3.290075e-03 0.062778062
## 3       TPM      2059.81139             0.3823293  7.586541e-05 0.003860243
## 4       TPM        17.90961            -1.9770953  1.156826e-03 0.030092651
## 5       TPM       177.45680             0.8927097  1.237455e-04 0.005807258
## 6       TPM     16587.89462            -0.2445521  7.861174e-05 0.003985386
##   Input_Counts Shrinkage       mean_TC        lfc_TC        FDR_TC num.trans
## 1       Counts      None    3.31761219 -0.0138387970  6.365386e-03         9
## 2       Counts      None   -0.17926424 -0.0156107344  9.362617e-03         1
## 3       Counts      None  -13.82492665  0.0001196663 -1.110363e-05         1
## 4       Counts      None   -0.05225284 -0.0031251921  3.431740e-03         3
## 5       Counts      None   -1.17697397 -0.0013854348  7.437539e-04        10
## 6       Counts      None -109.94268956 -0.0002502272 -4.730519e-04         8
dim(lfcTable)
## [1] 3660   16
# Set a function to create a vector storing plot labels
plotlabels.fn <- function(myvec, mylist, num) {

    vec <- c()

    for (i in 1:num) {
        vec <- c(vec, c(myvec[i], rep("", nrow(mylist[[i]]) - 1)))
    }

    return(vec)

}
my.param <- c("baseMean", "log2FoldChange", "padj")


# Slice and clean the data frame for input
lfcTable.comp <- lfcTable %>% 
    dplyr::select(GENEID, num.trans, Shrinkage, starts_with(my.param)) %>%  
    gather(Category, Value, -GENEID, -num.trans, -Shrinkage) %>% 
    separate(Category, c("Metric", "Input"), sep="_") %>% 
    pivot_wider(names_from=Input, values_from=Value) %>%
    nest(-Metric, -Shrinkage)  

# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.comp$data, 
                          ~cor(.x$TPM, .x$Counts)),
                  7)

# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec, 
                         lfcTable.comp$data, 
                         length(corr.vec))


# Unnest the data frame
lfcTable.comp <- lfcTable.comp %>% 
    unnest(data) 

# Add a column storing Rsquared labels 
lfcTable.comp$Rsquared.label <- rsq.vec


# Explore the cleaned data frame
head(lfcTable.comp)
## # A tibble: 6 x 7
##   Shrinkage Metric   GENEID             num.trans     TPM  Counts Rsquared.label
##   <chr>     <chr>    <chr>                  <int>   <dbl>   <dbl> <chr>         
## 1 None      baseMean ENSMUSG00000000037         9   120.    116.  "0.9999785"   
## 2 None      baseMean ENSMUSG00000000125         1    32.6    32.8 ""            
## 3 None      baseMean ENSMUSG00000000278         1  2046.   2060.  ""            
## 4 None      baseMean ENSMUSG00000000303         3    17.9    17.9 ""            
## 5 None      baseMean ENSMUSG00000000308        10   176.    177.  ""            
## 6 None      baseMean ENSMUSG00000000409         8 16478.  16588.  ""
# Nest the data frame by metric
lfcTable.comp <- lfcTable.comp %>%
    nest(-Metric) 
# Set a function creating a scatter plot
comp.scatter.fn <- function(df, 
                            xvar, 
                            yvar, 
                            met, 
                            xlabel, 
                            ylabel) {

    p <- ggplot(df, aes(x=xvar, 
                        y=yvar, 
                        color=log(num.trans), 
                        label=Rsquared.label)) + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Comparison in", met, "\n(with R-Squared)")) + 
        geom_text(size=5, 
                  mapping=aes(x=Inf, y=Inf), 
                  vjust=2, hjust=3.8, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black") + 
scale_color_gradient(low="blue", high="red") +
facet_wrap(~Shrinkage, scales="free") + 
theme(strip.text.x=element_text(size=10)) +
xlab(paste("Input from", xlabel)) + 
ylab(paste("Input from", ylabel))


return(p)
}




# Print the plots
for (i in 1:nrow(lfcTable.comp)) {

    df <- lfcTable.comp$data[[i]]

    
    df$Shrinkage <- factor(df$Shrinkage, levels=unique(df$Shrinkage))

    print(comp.scatter.fn(df,
                          df$TPM,
                          df$Counts,
                          lfcTable.comp$Metric[i], "TPM", "Counts"))
    
}

baseMean/LFC/FDR rank comparison between TPM and Count inputs

# Transform the data frame
lfcTable.rank <- lfcTable.comp %>% 
    unnest(data) %>%
    nest(-Metric, -Shrinkage)

# Clean and arrange the data frame
for (i in 1:length(lfcTable.rank$data)) {

    df <- lfcTable.rank$data[[i]] 

    # Create a vector storing rank (1 to the last)
    rank.vec <- 1:nrow(df)

    # Make descending order if the metric = "padj" and assigne rankings
    if (lfcTable.rank$Metric[[i]] == "padj") {

        df <- df %>%
            dplyr::arrange(TPM) %>% 
            mutate(rank.TPM=rank.vec) %>%
            dplyr::arrange(Counts) %>%
            mutate(rank.Counts=rank.vec) 

    # Make asending order otherwise and assign rankings
    } else {

        df <- df %>%
            dplyr::arrange(desc(abs(TPM))) %>%
            mutate(rank.TPM=rank.vec) %>% 
            dplyr::arrange(desc(abs(Counts))) %>%
            mutate(rank.Counts=rank.vec)

    }

    # Save the updated data frame 
    lfcTable.rank$data[[i]] <- df 



}

# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.rank$data, 
                          ~cor(.x$rank.TPM, .x$rank.Counts)),
                  7)

# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec, lfcTable.rank$data, length(corr.vec))

# Unnest the data frame
lfcTable.rank <- lfcTable.rank %>% 
    unnest(data) 

# Add a column storing Rsquared labels 
lfcTable.rank$Rsquared.label <- rsq.vec


# Explore the cleaned data frame
head(lfcTable.rank)
## # A tibble: 6 x 9
##   Metric  Shrinkage GENEID       num.trans    TPM Counts Rsquared.label rank.TPM
##   <chr>   <chr>     <chr>            <int>  <dbl>  <dbl> <chr>             <int>
## 1 baseMe… None      ENSMUSG0000…         3 72567. 73053. "0.9999016"           1
## 2 baseMe… None      ENSMUSG0000…         3 53754. 54115. ""                    2
## 3 baseMe… None      ENSMUSG0000…        11 53183. 53541. ""                    3
## 4 baseMe… None      ENSMUSG0000…         4 46583. 46896. ""                    4
## 5 baseMe… None      ENSMUSG0000…         1 43174. 43432. ""                    5
## 6 baseMe… None      ENSMUSG0000…         3 38943. 39207. ""                    6
## # … with 1 more variable: rank.Counts <int>
# Nest the data frame by metric
lfcTable.rank <- lfcTable.rank %>%
    nest(-Metric) 




# Print the plots
for (i in 1:nrow(lfcTable.rank)) {

    df <- lfcTable.rank$data[[i]]
    
    df$Shrinkage <- factor(df$Shrinkage, levels=unique(df$Shrinkage))

    pl <- comp.scatter.fn(df,
                          df$rank.TPM,
                          df$rank.Counts,
                          lfcTable.rank$Metric[i], 
                          "TPM", "Counts") + 
ggtitle(paste("Comparison in", lfcTable.rank$Metric[i], "Rank\n(with R-Squared)")) 

print(pl)

 
}

Summarizing up/down DEGs with an upset plot

- red bar: input type

- blue bar: directionality of gene expression change

# Set a function cleaning data to generate upset plots 
upset.input.fn <- function(df) {

    df <- df %>% 

        # Filter genes with valid padj 
        subset(!is.na(padj)) %>% 
        
        mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
               
               Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
               
               Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
               
               TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
               
               Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input? 

    # Create a list storing groups of interest
    upsetInput <- list(Up=df$Up, 
                       Down=df$Down, 
                       Unchanged=df$Unchanged, 
                       TPM_Input=df$TPM, 
                       Counts_Input=df$Counts) 

    return(upsetInput)

}

upsetList <- upset.input.fn(both.df)


# Create the upset plot 
upset(fromList(upsetList), 
      sets=names(upsetList),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red","blue", "blue", "blue"),
      text.scale = 1.5, number.angles=30) 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ashr_2.2-47                 apeglm_1.12.0              
##  [3] ensembldb_2.14.0            AnnotationFilter_1.14.0    
##  [5] GenomicFeatures_1.42.1      AnnotationDbi_1.52.0       
##  [7] UpSetR_1.4.0                pheatmap_1.0.12            
##  [9] gridExtra_2.3               DESeq2_1.30.1              
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0             
## [13] MatrixGenerics_1.2.0        matrixStats_0.58.0         
## [15] GenomicRanges_1.42.0        GenomeInfoDb_1.26.0        
## [17] IRanges_2.24.1              S4Vectors_0.28.1           
## [19] tximport_1.18.0             forcats_0.5.1              
## [21] stringr_1.4.0               dplyr_1.0.5                
## [23] purrr_0.3.4                 readr_1.4.0                
## [25] tidyr_1.1.3                 tibble_3.1.0               
## [27] ggplot2_3.3.3               tidyverse_1.3.0            
## [29] AnnotationHub_2.22.0        BiocFileCache_1.14.0       
## [31] dbplyr_2.1.0                BiocGenerics_0.36.0        
## [33] rmarkdown_2.7               data.table_1.14.0          
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                  backports_1.2.1              
##   [3] plyr_1.8.6                    lazyeval_0.2.2               
##   [5] splines_4.0.3                 BiocParallel_1.24.0          
##   [7] digest_0.6.27                 invgamma_1.1                 
##   [9] htmltools_0.5.1.1             SQUAREM_2021.1               
##  [11] fansi_0.4.2                   magrittr_2.0.1               
##  [13] memoise_2.0.0                 Biostrings_2.58.0            
##  [15] annotate_1.68.0               modelr_0.1.8                 
##  [17] askpass_1.1                   bdsmatrix_1.3-4              
##  [19] prettyunits_1.1.1             colorspace_2.0-0             
##  [21] blob_1.2.1                    rvest_0.3.6                  
##  [23] rappdirs_0.3.3                haven_2.3.1                  
##  [25] xfun_0.21                     crayon_1.4.1                 
##  [27] RCurl_1.98-1.2                jsonlite_1.7.2               
##  [29] genefilter_1.72.1             survival_3.2-7               
##  [31] glue_1.4.2                    gtable_0.3.0                 
##  [33] zlibbioc_1.36.0               XVector_0.30.0               
##  [35] DelayedArray_0.16.0           scales_1.1.1                 
##  [37] mvtnorm_1.1-1                 DBI_1.1.1                    
##  [39] Rcpp_1.0.6                    xtable_1.8-4                 
##  [41] progress_1.2.2                emdbook_1.3.12               
##  [43] bit_4.0.4                     truncnorm_1.0-8              
##  [45] httr_1.4.2                    RColorBrewer_1.1-2           
##  [47] ellipsis_0.3.1                farver_2.0.3                 
##  [49] pkgconfig_2.0.3               XML_3.99-0.5                 
##  [51] sass_0.3.1                    locfit_1.5-9.4               
##  [53] utf8_1.1.4                    labeling_0.4.2               
##  [55] tidyselect_1.1.0              rlang_0.4.10                 
##  [57] later_1.1.0.1                 munsell_0.5.0                
##  [59] BiocVersion_3.12.0            cellranger_1.1.0             
##  [61] tools_4.0.3                   cachem_1.0.4                 
##  [63] cli_2.3.1                     generics_0.1.0               
##  [65] RSQLite_2.2.3                 broom_0.7.5                  
##  [67] evaluate_0.14                 fastmap_1.1.0                
##  [69] yaml_2.2.1                    knitr_1.31                   
##  [71] bit64_4.0.5                   fs_1.5.0                     
##  [73] mime_0.10                     xml2_1.3.2                   
##  [75] biomaRt_2.46.3                compiler_4.0.3               
##  [77] rstudioapi_0.13               curl_4.3                     
##  [79] interactiveDisplayBase_1.28.0 reprex_1.0.0                 
##  [81] geneplotter_1.68.0            bslib_0.2.4                  
##  [83] stringi_1.5.3                 highr_0.8                    
##  [85] ps_1.5.0                      lattice_0.20-41              
##  [87] ProtGenerics_1.22.0           Matrix_1.3-2                 
##  [89] vctrs_0.3.6                   pillar_1.5.0                 
##  [91] lifecycle_1.0.0               BiocManager_1.30.10          
##  [93] jquerylib_0.1.3               irlba_2.3.3                  
##  [95] bitops_1.0-6                  httpuv_1.5.5                 
##  [97] rtracklayer_1.50.0            R6_2.5.0                     
##  [99] promises_1.2.0.1              MASS_7.3-53.1                
## [101] assertthat_0.2.1              openssl_1.4.3                
## [103] withr_2.4.1                   GenomicAlignments_1.26.0     
## [105] Rsamtools_2.6.0               GenomeInfoDbData_1.2.4       
## [107] hms_1.0.0                     grid_4.0.3                   
## [109] coda_0.19-4                   mixsqp_0.3-43                
## [111] bbmle_1.0.23.1                numDeriv_2016.8-1.1          
## [113] shiny_1.6.0                   lubridate_1.7.10